# -*- coding: utf-8 -*-
import sys
import os
import json
import random
import time
import pandas as pd


# Reading a large text file with pandas in a single call can raise
# MemoryError when memory runs out; reading it in chunks avoids that.
# chunkSum: number of chunks to read, optionally supplied as the first
# command-line argument (defaults to 100). With 50 rows per chunk, up to
# chunkSum * 50 rows are read in total.
initState = 0  # number of chunks consumed so far
chunkSum = int(sys.argv[1]) if len(sys.argv) > 1 else 100

# Source CSV that is read, and the text file that rows are appended to.
file_name = "/worker/data/orders.csv"
write_file = "/worker/data/flume_exec_test.txt"

# Keys used for each emitted JSON object; they are paired with the values
# of a CSV row by position.
cols = [
    "order_id",
    "user_id",
    "eval_set",
    "order_number",
    "order_dow",
    "hour",
    "day",
]

# Stream the CSV to the output file as JSON lines, one 50-row chunk at a
# time, pausing for a random sub-second interval after each chunk. At most
# chunkSum chunks are written; the loop also ends when the CSV runs out.
with open(write_file, 'a+') as wrf:
    for chunk in pd.read_csv(file_name, chunksize=50):
        if initState >= chunkSum:
            break
        initState += 1

        # Replace missing values with 0 before serializing the rows.
        data = chunk.fillna(value=0)

        # One JSON object per row; values are matched to `cols` by position.
        wrf.writelines(
            json.dumps(dict(zip(cols, row))) + "\n"
            for _, row in data.iterrows()
        )

        # Push the chunk to disk before sleeping; otherwise the lines stay
        # in the write buffer and a process following the file (the output
        # name suggests a Flume exec source -- confirm) would not see them
        # as they are produced.
        wrf.flush()

        time.sleep(random.random())

# Printed whether the chunk limit was reached or the CSV was exhausted first.
print("=====================Read Completed=======================")

